- create a table per each chart set
- group queries by sub-type (sorting.. merging)
- chanrts with two cols
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DT)
library(htmltools)
Setup
options = params$options
targetVar = params$targetVar
targetUnits = ifelse(targetVar=='Throughput','MB/s','secs')
showOutliers = params$showOutliers
outliersQuantile = ifelse(showOutliers, 1 ,0.90) #use 1.0 to include all outliers
outliersShape = ifelse(showOutliers, 19,NA)
plotCols=2
Reading Files
read.clean.files = function(filename){
file = read.csv(filename, header = FALSE)
column.names = c("Language","Randomize","Dataset","MachineID","RunID","Type","Operation","TimeTaken")
colnames(file) = column.names
return(file)
}
files = list.files(path = "../Results/", pattern = ".csv$", recursive = TRUE, full.names = TRUE) # List all .csv files
#files
databricks.files = files[grepl("Databricks",files)]
local.vm..files = files[grepl("Local_VM",files)]
cluster.files = files[grepl("Cluster",files)]
rows.databricks = lapply(databricks.files, read.csv, header = FALSE) # Read the files into list
merged.databricks = do.call(rbind, rows.databricks) # combine the data.frame
merged.databricks$Setup = 'Databricks'
rows.local.vm = lapply(local.vm..files, read.csv, header = FALSE) # Read the files into list
merged.local.vm = do.call(rbind, rows.local.vm) # combine the data.frame
merged.local.vm$Setup = 'Local VM'
rows.cluster = lapply(cluster.files, read.csv, header = FALSE) # Read the files into list
merged.cluster = do.call(rbind, rows.cluster) # combine the data.frame
merged.cluster$Setup = 'Cluster'
merged_data = rbind(merged.databricks,merged.local.vm,merged.cluster)
merged_data$Setup = as.factor(merged_data$Setup)
column.names = c("Language","Randomize","Dataset","MachineID","RunID","Type","Operation","TimeTaken","Setup")
colnames(merged_data) = column.names
merged_data$Type = as.factor(gsub(pattern = "Operations", replacement = "Operation", x = merged_data$Type))
# Convert columns to factors
merged_data$MachineID = as.factor(merged_data$MachineID)
merged_data$Randomize = as.factor(merged_data$Randomize)
merged_data$RunID = as.factor(merged_data$RunID)
merged_data$Dataset = sub("dataset_", "", merged_data$Dataset)
merged_data$Dataset = sub("MB$", "", merged_data$Dataset)
merged_data$Dataset = as.factor(merged_data$Dataset)
merged_data$Operation = trimws(as.character(merged_data$Operation),'both')
merged_data[merged_data$Operation =='Mathematical Operations on Columns',]$Operation ='Mathematical Operation on Columns'
merged_data$subType = ''
merged_data[grepl('^Sorting',merged_data$Operation),]$subType = 'Sorting'
merged_data[grepl('^Ranking',merged_data$Operation),]$subType = 'Ranking'
merged_data[grepl('^Split',merged_data$Operation),]$subType = 'Splitting'
merged_data[grepl('^GroupBy',merged_data$Operation),]$subType = 'Grouping'
merged_data[grepl(' Join ',merged_data$Operation),]$subType = 'Joining'
merged_data[grepl('^Merge',merged_data$Operation),]$subType = 'Merging'
merged_data[grepl('^Filter',merged_data$Operation),]$subType = 'Filtering'
merged_data[grepl('^Mathematical',merged_data$Operation),]$subType = 'Mathematics'
merged_data[grepl('^Pivot',merged_data$Operation),]$subType = 'Pivots'
merged_data[grepl('^Running|^Shift',merged_data$Operation),]$subType = 'Run/Shift'
merged_data[grepl('^Writing',merged_data$Operation),]$subType = 'Writing'
size_10MB = 11.4789848327637 # file.size("../../Data/Databricks/machine2/dataset_10MB.csv")/(1024*1024)
size_100MB = 115.640992164612 # file.size("../../Data/Databricks/machine2/dataset_100MB.csv")/(1024*1024)
size_200MB = 229.8573
size_300MB = 343.2709
size_500MB = 576.678165435791 # file.size("../../Data/Databricks/machine2/dataset_500MB.csv")/(1024*1024)
print(paste("Actual Size of 10MB file (in MB)",size_10MB))
## [1] "Actual Size of 10MB file (in MB) 11.4789848327637"
print(paste("Actual Size of 100MB file (in MB)",size_100MB))
## [1] "Actual Size of 100MB file (in MB) 115.640992164612"
print(paste("Actual Size of 200MB file (in MB)",size_200MB))
## [1] "Actual Size of 200MB file (in MB) 229.8573"
print(paste("Actual Size of 300MB file (in MB)",size_300MB))
## [1] "Actual Size of 300MB file (in MB) 343.2709"
print(paste("Actual Size of 500MB file (in MB)",size_500MB))
## [1] "Actual Size of 500MB file (in MB) 576.678165435791"
size_info = data.frame(Dataset = c("10","100","200","300","500")
,Size = c(size_10MB,size_100MB,size_200MB,size_300MB,size_500MB))
str(size_info)
## 'data.frame': 5 obs. of 2 variables:
## $ Dataset: Factor w/ 5 levels "10","100","200",..: 1 2 3 4 5
## $ Size : num 11.5 115.6 229.9 343.3 576.7
merged_data = merged_data %>%
merge(size_info,by='Dataset') %>%
mutate(Throughput = Size/TimeTaken)
Setups = c(unique(as.character(merged_data$Setup)))
data_raw = merged_data
merged_data = merged_data %>%
filter(RunID != 1)
str(merged_data)
## 'data.frame': 7397 obs. of 12 variables:
## $ Dataset : Factor w/ 5 levels "10","100","200",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Language : Factor w/ 2 levels "PySpark","Scala": 2 1 2 2 2 1 1 1 1 2 ...
## $ Randomize : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
## $ MachineID : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ RunID : Factor w/ 6 levels "1","2","3","4",..: 5 6 5 5 5 6 6 6 6 4 ...
## $ Type : Factor w/ 4 levels "Aggregate Operation",..: 2 2 1 2 1 2 2 2 2 2 ...
## $ Operation : chr "Sorting Asc 1 column" "Sorting Desc 5 column" "Ranking by Group" "Split 1 Column into 5" ...
## $ TimeTaken : num 1.88 4.36 1.88 1.04 12.38 ...
## $ Setup : Factor w/ 3 levels "Cluster","Databricks",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ subType : chr "Sorting" "Sorting" "Ranking" "Splitting" ...
## $ Size : num 11.5 11.5 11.5 11.5 11.5 ...
## $ Throughput: num 6.093 2.635 6.103 11.08 0.927 ...
head(merged_data)
## Dataset Language Randomize MachineID RunID Type
## 1 10 Scala 1 1 5 Column Operation
## 2 10 PySpark 1 1 6 Column Operation
## 3 10 Scala 1 1 5 Aggregate Operation
## 4 10 Scala 1 1 5 Column Operation
## 5 10 Scala 1 1 5 Aggregate Operation
## 6 10 PySpark 1 1 6 Column Operation
## Operation TimeTaken Setup subType Size
## 1 Sorting Asc 1 column 1.884000 Databricks Sorting 11.47898
## 2 Sorting Desc 5 column 4.355605 Databricks Sorting 11.47898
## 3 Ranking by Group 1.881000 Databricks Ranking 11.47898
## 4 Split 1 Column into 5 1.036000 Databricks Splitting 11.47898
## 5 GroupBy 5 columns 12.381000 Databricks Grouping 11.47898
## 6 Full Outer Join 5 Columns 9.544184 Databricks Joining 11.47898
## Throughput
## 1 6.0928794
## 2 2.6354514
## 3 6.1025969
## 4 11.0801012
## 5 0.9271452
## 6 1.2027204
summary(merged_data)
## Dataset Language Randomize MachineID RunID
## 10 :1740 PySpark:3737 1:7397 1:4397 1: 0
## 100:1620 Scala :3660 2:3000 2:1698
## 200:1350 3:1671
## 300:1337 4:1455
## 500:1350 5:1293
## 6:1280
## Type Operation TimeTaken
## Aggregate Operation: 998 Length:7397 Min. : 0.230
## Column Operation :3654 Class :character 1st Qu.: 3.161
## Mixed Operation : 749 Mode :character Median : 10.261
## Row Operation :1996 Mean : 25.683
## 3rd Qu.: 26.573
## Max. :631.088
## Setup subType Size Throughput
## Cluster :1500 Length:7397 Min. : 11.48 Min. : 0.273
## Databricks:2972 Class :character 1st Qu.:115.64 1st Qu.: 5.916
## Local VM :2925 Mode :character Median :229.86 Median : 13.489
## Mean :237.27 Mean : 25.278
## 3rd Qu.:343.27 3rd Qu.: 26.895
## Max. :576.68 Max. :268.098
print(!(.Platform$GUI == "RStudio"))
## [1] TRUE
Common Functions
subchunkify_clear <- function() {
files=dir(path='figure/',pattern='sub_chunk_',include.dirs=T,full.names = T)
all(file.remove(files))
}
subchunkify <- function(g, fig_height=7, fig_width=5, enabled = !(.Platform$GUI == "RStudio")) {
if (enabled==FALSE) return(plot(g))
g_deparsed <- paste0(deparse(
function() {g}
), collapse = '')
sub_chunk <- paste0("
`","``{r sub_chunk_", floor(runif(1) * 10000), ", fig.height=", fig_height, ", fig.width=", fig_width, ", echo=FALSE}",
"\n(",
g_deparsed
, ")()",
"\n`","``
")
cat(knitr::knit(text = knitr::knit_expand(text = sub_chunk), quiet = TRUE))
}
ggplot_colors = function(plot,strip_angle=0,...){
plot +
scale_color_manual(values=c("#ca0020","#0571b0"),breaks=c('PySpark','Scala'))+
scale_fill_manual(values=c("#f4a582","#92c5de"),breaks=c('PySpark','Scala'))+
theme_light() +
ggplot2::theme(strip.text.y=element_text(angle=strip_angle)
,...)
}
Simple Linear Regression Model
using a simple LM we can identify the elements that contribute to the speed of the queries, we see Scala has a coeffiecnet of -7, meaning that it bring an overall benefits of 7 seconds, keeping constant the other variables.
subchunkify_clear()
## [1] TRUE
form=as.formula(paste0(targetVar,' ~ Dataset + Language + Operation + Setup'))
model = lm(data=merged_data,formula=form)
summary(model)
##
## Call:
## lm(formula = form, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.659 -8.926 -0.922 7.598 191.520
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 49.8435 1.5592 31.967
## Dataset100 17.2561 0.7583 22.758
## Dataset200 19.5960 0.8131 24.099
## Dataset300 18.6564 0.8152 22.885
## Dataset500 19.1115 0.8131 23.504
## LanguageScala 10.2988 0.5024 20.501
## OperationFilter Reg Ex 1 15.3760 1.9341 7.950
## OperationFilter Reg Ex 2 15.8812 1.9341 8.211
## OperationFull Outer Join 10 Columns -37.2898 2.6739 -13.946
## OperationFull Outer Join 3 Columns -32.3872 3.7101 -8.729
## OperationFull Outer Join 5 Columns -39.3983 2.6739 -14.734
## OperationGroupBy 1 column -39.8436 1.9341 -20.601
## OperationGroupBy 10 columns -45.3896 1.9321 -23.492
## OperationGroupBy 5 columns -43.2951 1.9341 -22.386
## OperationInner Join 10 Columns -34.9459 2.6739 -13.069
## OperationInner Join 3 Columns -31.1775 3.7101 -8.403
## OperationInner Join 5 Columns -36.2929 2.6739 -13.573
## OperationLeft Outer Join 10 Columns -35.4052 2.6739 -13.241
## OperationLeft Outer Join 3 Columns -31.1768 3.7101 -8.403
## OperationLeft Outer Join 5 Columns -37.9073 2.6739 -14.177
## OperationMathematical Operation on Columns 55.6608 1.9341 28.779
## OperationMerge 10 columns into 1 -28.7629 1.9341 -14.872
## OperationMerge 2 columns into 1 -26.8238 1.9321 -13.883
## OperationMerge 5 columns into 1 -27.7477 1.9321 -14.361
## OperationPivot 1 Rows and 1 Column -11.1919 1.9321 -5.793
## OperationPivot 10 Rows and 1 Column -33.2620 1.9321 -17.215
## OperationPivot 5 Rows and 1 Column -29.5658 1.9341 -15.287
## OperationRanking by Group -33.1435 1.9321 -17.154
## OperationRunning Sum 5.1520 1.9321 2.666
## OperationShift (Lag) 8.3843 1.9321 4.339
## OperationSorting Asc 1 column -39.3792 1.9321 -20.381
## OperationSorting Asc 10 column -39.9519 1.9321 -20.678
## OperationSorting Asc 5 column -39.5402 1.9341 -20.444
## OperationSorting Desc 1 column -39.3327 1.9341 -20.337
## OperationSorting Desc 10 column -39.8533 1.9341 -20.606
## OperationSorting Desc 5 column -39.5264 1.9341 -20.437
## OperationSplit 1 Column into 10 -33.4359 1.9321 -17.305
## OperationSplit 1 Column into 5 -30.8942 1.9321 -15.990
## OperationWriting 100 new rows -23.8030 1.9321 -12.320
## OperationWriting 1000 new rows -24.2613 1.9341 -12.544
## OperationWriting 10000 new rows -26.6119 1.9321 -13.773
## SetupDatabricks -22.5190 0.6836 -32.943
## SetupLocal VM -29.1892 0.6857 -42.570
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## Dataset100 < 2e-16 ***
## Dataset200 < 2e-16 ***
## Dataset300 < 2e-16 ***
## Dataset500 < 2e-16 ***
## LanguageScala < 2e-16 ***
## OperationFilter Reg Ex 1 2.14e-15 ***
## OperationFilter Reg Ex 2 2.56e-16 ***
## OperationFull Outer Join 10 Columns < 2e-16 ***
## OperationFull Outer Join 3 Columns < 2e-16 ***
## OperationFull Outer Join 5 Columns < 2e-16 ***
## OperationGroupBy 1 column < 2e-16 ***
## OperationGroupBy 10 columns < 2e-16 ***
## OperationGroupBy 5 columns < 2e-16 ***
## OperationInner Join 10 Columns < 2e-16 ***
## OperationInner Join 3 Columns < 2e-16 ***
## OperationInner Join 5 Columns < 2e-16 ***
## OperationLeft Outer Join 10 Columns < 2e-16 ***
## OperationLeft Outer Join 3 Columns < 2e-16 ***
## OperationLeft Outer Join 5 Columns < 2e-16 ***
## OperationMathematical Operation on Columns < 2e-16 ***
## OperationMerge 10 columns into 1 < 2e-16 ***
## OperationMerge 2 columns into 1 < 2e-16 ***
## OperationMerge 5 columns into 1 < 2e-16 ***
## OperationPivot 1 Rows and 1 Column 7.22e-09 ***
## OperationPivot 10 Rows and 1 Column < 2e-16 ***
## OperationPivot 5 Rows and 1 Column < 2e-16 ***
## OperationRanking by Group < 2e-16 ***
## OperationRunning Sum 0.00768 **
## OperationShift (Lag) 1.45e-05 ***
## OperationSorting Asc 1 column < 2e-16 ***
## OperationSorting Asc 10 column < 2e-16 ***
## OperationSorting Asc 5 column < 2e-16 ***
## OperationSorting Desc 1 column < 2e-16 ***
## OperationSorting Desc 10 column < 2e-16 ***
## OperationSorting Desc 5 column < 2e-16 ***
## OperationSplit 1 Column into 10 < 2e-16 ***
## OperationSplit 1 Column into 5 < 2e-16 ***
## OperationWriting 100 new rows < 2e-16 ***
## OperationWriting 1000 new rows < 2e-16 ***
## OperationWriting 10000 new rows < 2e-16 ***
## SetupDatabricks < 2e-16 ***
## SetupLocal VM < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.58 on 7354 degrees of freedom
## Multiple R-squared: 0.6159, Adjusted R-squared: 0.6137
## F-statistic: 280.8 on 42 and 7354 DF, p-value: < 2.2e-16
Comparing of Environments
Comparing Local VM vs. Cloud similar in configuration, controlling by the type operation. The cloud machine is optimized for Scala and Spark, while the local machine has a standard installation of the softwars.
We notice the DataBricks environment benefit the aggregate operation, and that Databricks beenfits the scala encironemtn for the Row Operations. Anyway, other sitation shows a similar perfomance across lancuages and environment
filtered= merged_data
p=ggplot(data=filtered, aes_string(x='Setup',y=targetVar,color='Language',fill='Language')) +
geom_boxplot(outlier.size = 0.1,size=0.1,outlier.shape = outliersShape) +
scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]],probs=outliersQuantile))
,labels=scales::comma_format(accuracy=1,suffix=targetUnits)) +
facet_wrap(~Type,scales='free',ncol=plotCols)
ggplot_colors(p,axis.text.y=element_text(size=7),legend.position='right')
## Warning: Removed 740 rows containing non-finite values (stat_boxplot).

Comparing different data sizes
Comparing the peromance of the differnt queries among different sizes of dataset. We notice that the increased size has an import impact on the time. PySpark is suffering more by the increased size, mainly for Row and Column Operations, while Scala language shows a better performance. We can see
plotDataSize=function(filtered,title){
t = filtered
p=ggplot(data=t, aes_string(x='Dataset',y=targetVar,color='Language',fill='Language')) +
geom_boxplot(outlier.size = 0.1,size=0.1,outlier.shape=outliersShape) +
scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]],probs=outliersQuantile))
,labels=scales::comma_format(accuracy=1,suffix=targetUnits)) +
facet_wrap(~Type ,labeller=label_wrap_gen(width=100),scales='free_x',ncol=plotCols) +
ggplot2::ggtitle(title)
ggplot_colors(p,strip_angle=0
,axis.text.y=element_text(size=7)
,axis.text.x=element_text(size=9))
}
All Setups
filtered= merged_data
plotDataSize(filtered,'Comparing Datsets - All Environments')
## Warning: Removed 740 rows containing non-finite values (stat_boxplot).

By Setup
for (s in Setups) {
message(s)
filtered= merged_data %>% filter(Setup==s)
plot(plotDataSize(filtered,paste0('Comparing Datsets - ',s)))
}
## Databricks
## Warning: Removed 298 rows containing non-finite values (stat_boxplot).
## Local VM

## Warning: Removed 293 rows containing non-finite values (stat_boxplot).
## Cluster

## Warning: Removed 150 rows containing non-finite values (stat_boxplot).

Comparing Query Types
As expected, the slowest queries are the Aggregate one,while the fastest ones are the Row Operations.Scala outperform PySpark on the row operations, while we don’t see a significant differnece on the other query types
filtered=merged_data
filtered$Type = forcats::fct_reorder(filtered$Type,filtered[[targetVar]],.fun=median,.desc=F)
p=ggplot(data=filtered, aes_string(x='Type',y=targetVar,color='Language',fill='Language')) +
geom_boxplot(outlier.size = 0.1,size=0.1,outlier.shape=outliersShape) +
scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]],probs=outliersQuantile))
,labels=scales::comma_format(accuracy=1,suffix=targetUnits)) +
facet_wrap(~Setup,ncol=3) +
ggplot2::coord_flip()
ggplot_colors(p,axis.text.x=element_text(size=8)
,panel.spacing.x=unit(10,'points')
,legend.position='bottom')
## Warning: Removed 740 rows containing non-finite values (stat_boxplot).

Comparing Queries
#functions
plot_get_significant = function(filtered){
w<-function(oper){wilcox.test(as.formula(paste0(targetVar,' ~ Language ')),alternative='two.sided'
,filtered,subset = Operation == oper)$p.value}
ops = unique(filtered$Operation)
test=sapply(ops,w)
keep= ops[which(test < 0.025)]
data2 = filtered[filtered$Operation %in% keep,]
if(nrow(data2)==0) {message("No Significant Elements") ; return(NULL)}
return(data2)
}
plot_operations_get = function(filtered,title='',subtitle='',ncol = plotCols){
if(is.null(filtered)) return(ggplot(data=filtered)+geom_blank())
plotData = filtered %>% mutate(Operation = factor(Operation))
p= ggplot(data=plotData, aes_string(x='Dataset',y=targetVar,color='Language')) +
ggplot2::stat_summary(aes(group=Language),fun.y=median,geom='line',size=.5 )+
geom_jitter(alpha=0.5,size=.5,width=0.1)+
scale_y_continuous(labels=scales::comma_format(accuracy=1,suffix=targetUnits)) +
facet_wrap(~ Operation,scales='free',ncol=ncol,labeller = label_wrap_gen(width = round(100/ncol))) +
#facet_grid(Operation~subType,scales='free',labeller = label_wrap_gen()) +
ggplot2::ggtitle(title,subtitle=subtitle)
ggplot_colors(p,legend.position = 'bottom'
,axis.text.x=element_text(size=8,angle=45,hjust=1)
,axis.text.y=element_text(size=8)
,strip.background = element_rect(fill='#636363')
)
}
plot_operations = function(filtered,title,ncol=plotCols,autoSize = ~(.Platform$GUI == "RStudio")){
for (st in sort(unique(filtered$subType))){
data=filter(filtered,subType==st)
p=plot_operations_get(data,title=paste0(title,' - ' , st),subtitle=' (All Operations) ')
subchunkify(p, fig_height=ceiling(n_distinct(data$Operation)/plotCols)*3+.5, fig_width=plotCols*3)
data=plot_get_significant(filter(filtered,subType==st))
if(!is.null(data)) {
p=plot_operations_get(data,title=paste0(title,' - ' , st ),subtitle=' (Significant Operations) ')
subchunkify(p, fig_height=ceiling(n_distinct(data$Operation)/plotCols)*3+.5, fig_width=plotCols*3)
}
}
}
getTable = function(data){
t=data %>% group_by(subType,Operation,Dataset,Language) %>%
summarise_at(.vars=targetVar,.funs=vars(mean)) %>%
ungroup() %>%
spread(key=Language,value=targetVar) %>%
mutate(delta = PySpark-Scala)
knitr::kable(t,digits=2,col.names=c('Category','Operation','Dataset','PySpark (sec)','Scala (sec)','PySpark - Scala'),padding=0,format='markdown')
}
Comparing Row operations
Rows Operations - All Environments:
| Filtering |
Filter |
10 |
19.27 |
22.92 |
-3.64 |
| Filtering |
Filter |
100 |
50.36 |
51.01 |
-0.65 |
| Filtering |
Filter |
200 |
52.92 |
62.15 |
-9.24 |
| Filtering |
Filter |
300 |
48.25 |
65.61 |
-17.36 |
| Filtering |
Filter |
500 |
57.79 |
62.15 |
-4.36 |
| Filtering |
Filter Reg Ex 1 |
10 |
21.49 |
25.13 |
-3.63 |
| Filtering |
Filter Reg Ex 1 |
100 |
68.48 |
67.66 |
0.82 |
| Filtering |
Filter Reg Ex 1 |
200 |
78.96 |
74.62 |
4.34 |
| Filtering |
Filter Reg Ex 1 |
300 |
68.20 |
82.86 |
-14.66 |
| Filtering |
Filter Reg Ex 1 |
500 |
83.87 |
75.10 |
8.77 |
| Filtering |
Filter Reg Ex 2 |
10 |
21.97 |
25.68 |
-3.71 |
| Filtering |
Filter Reg Ex 2 |
100 |
69.72 |
66.23 |
3.49 |
| Filtering |
Filter Reg Ex 2 |
200 |
77.99 |
79.03 |
-1.04 |
| Filtering |
Filter Reg Ex 2 |
300 |
70.58 |
81.03 |
-10.45 |
| Filtering |
Filter Reg Ex 2 |
500 |
81.77 |
77.49 |
4.28 |
| Run/Shift |
Running Sum |
10 |
9.13 |
26.26 |
-17.12 |
| Run/Shift |
Running Sum |
100 |
16.30 |
92.15 |
-75.85 |
| Run/Shift |
Running Sum |
200 |
14.43 |
116.36 |
-101.93 |
| Run/Shift |
Running Sum |
300 |
12.02 |
117.70 |
-105.68 |
| Run/Shift |
Running Sum |
500 |
10.30 |
129.19 |
-118.89 |
| Run/Shift |
Shift (Lag) |
10 |
9.28 |
26.43 |
-17.15 |
| Run/Shift |
Shift (Lag) |
100 |
16.10 |
101.13 |
-85.03 |
| Run/Shift |
Shift (Lag) |
200 |
13.32 |
129.59 |
-116.27 |
| Run/Shift |
Shift (Lag) |
300 |
11.97 |
127.05 |
-115.08 |
| Run/Shift |
Shift (Lag) |
500 |
11.00 |
130.30 |
-119.30 |
| Writing |
Writing 100 new rows |
10 |
12.27 |
18.44 |
-6.17 |
| Writing |
Writing 100 new rows |
100 |
24.24 |
43.73 |
-19.48 |
| Writing |
Writing 100 new rows |
200 |
21.47 |
38.30 |
-16.83 |
| Writing |
Writing 100 new rows |
300 |
18.38 |
32.15 |
-13.77 |
| Writing |
Writing 100 new rows |
500 |
17.80 |
27.52 |
-9.72 |
| Writing |
Writing 1000 new rows |
10 |
13.14 |
18.02 |
-4.88 |
| Writing |
Writing 1000 new rows |
100 |
24.42 |
41.41 |
-16.99 |
| Writing |
Writing 1000 new rows |
200 |
21.48 |
37.49 |
-16.01 |
| Writing |
Writing 1000 new rows |
300 |
17.25 |
30.80 |
-13.55 |
| Writing |
Writing 1000 new rows |
500 |
16.75 |
28.79 |
-12.04 |
| Writing |
Writing 10000 new rows |
10 |
8.78 |
13.52 |
-4.74 |
| Writing |
Writing 10000 new rows |
100 |
21.81 |
38.44 |
-16.63 |
| Writing |
Writing 10000 new rows |
200 |
18.05 |
33.03 |
-14.98 |
| Writing |
Writing 10000 new rows |
300 |
15.96 |
32.82 |
-16.86 |
| Writing |
Writing 10000 new rows |
500 |
15.22 |
28.58 |
-13.36 |

## No Significant Elements



Rows Operations - Setup: Databricks
| Filtering |
Filter |
10 |
11.51 |
17.63 |
-6.12 |
| Filtering |
Filter |
100 |
53.18 |
49.69 |
3.49 |
| Filtering |
Filter |
200 |
55.69 |
68.63 |
-12.93 |
| Filtering |
Filter |
300 |
54.67 |
76.06 |
-21.39 |
| Filtering |
Filter |
500 |
67.35 |
63.36 |
3.99 |
| Filtering |
Filter Reg Ex 1 |
10 |
12.19 |
19.05 |
-6.85 |
| Filtering |
Filter Reg Ex 1 |
100 |
68.26 |
61.08 |
7.18 |
| Filtering |
Filter Reg Ex 1 |
200 |
85.08 |
79.32 |
5.75 |
| Filtering |
Filter Reg Ex 1 |
300 |
79.11 |
93.57 |
-14.46 |
| Filtering |
Filter Reg Ex 1 |
500 |
101.99 |
80.04 |
21.95 |
| Filtering |
Filter Reg Ex 2 |
10 |
13.48 |
18.23 |
-4.75 |
| Filtering |
Filter Reg Ex 2 |
100 |
68.72 |
64.62 |
4.10 |
| Filtering |
Filter Reg Ex 2 |
200 |
81.83 |
78.87 |
2.95 |
| Filtering |
Filter Reg Ex 2 |
300 |
83.05 |
88.46 |
-5.42 |
| Filtering |
Filter Reg Ex 2 |
500 |
96.51 |
80.21 |
16.30 |
| Run/Shift |
Running Sum |
10 |
5.35 |
18.67 |
-13.32 |
| Run/Shift |
Running Sum |
100 |
15.83 |
95.00 |
-79.17 |
| Run/Shift |
Running Sum |
200 |
14.21 |
118.52 |
-104.31 |
| Run/Shift |
Running Sum |
300 |
11.51 |
118.02 |
-106.51 |
| Run/Shift |
Running Sum |
500 |
9.70 |
98.83 |
-89.14 |
| Run/Shift |
Shift (Lag) |
10 |
5.53 |
20.33 |
-14.80 |
| Run/Shift |
Shift (Lag) |
100 |
15.30 |
92.75 |
-77.45 |
| Run/Shift |
Shift (Lag) |
200 |
11.57 |
119.28 |
-107.71 |
| Run/Shift |
Shift (Lag) |
300 |
10.62 |
125.22 |
-114.60 |
| Run/Shift |
Shift (Lag) |
500 |
10.38 |
110.26 |
-99.87 |
| Writing |
Writing 100 new rows |
10 |
7.93 |
13.09 |
-5.16 |
| Writing |
Writing 100 new rows |
100 |
24.88 |
45.05 |
-20.17 |
| Writing |
Writing 100 new rows |
200 |
22.27 |
30.82 |
-8.55 |
| Writing |
Writing 100 new rows |
300 |
17.20 |
23.80 |
-6.59 |
| Writing |
Writing 100 new rows |
500 |
15.87 |
23.61 |
-7.74 |
| Writing |
Writing 1000 new rows |
10 |
8.60 |
13.48 |
-4.88 |
| Writing |
Writing 1000 new rows |
100 |
24.80 |
40.75 |
-15.95 |
| Writing |
Writing 1000 new rows |
200 |
21.13 |
29.89 |
-8.77 |
| Writing |
Writing 1000 new rows |
300 |
16.93 |
25.35 |
-8.42 |
| Writing |
Writing 1000 new rows |
500 |
15.15 |
23.68 |
-8.53 |
| Writing |
Writing 10000 new rows |
10 |
6.11 |
10.67 |
-4.56 |
| Writing |
Writing 10000 new rows |
100 |
23.35 |
39.76 |
-16.40 |
| Writing |
Writing 10000 new rows |
200 |
16.02 |
22.83 |
-6.81 |
| Writing |
Writing 10000 new rows |
300 |
14.16 |
20.47 |
-6.31 |
| Writing |
Writing 10000 new rows |
500 |
13.54 |
19.58 |
-6.04 |

## No Significant Elements



Rows Operations - Setup: Local VM
| Filtering |
Filter |
10 |
21.02 |
23.17 |
-2.15 |
| Filtering |
Filter |
100 |
30.70 |
32.28 |
-1.58 |
| Filtering |
Filter |
200 |
32.08 |
33.25 |
-1.17 |
| Filtering |
Filter |
300 |
20.36 |
30.21 |
-9.85 |
| Filtering |
Filter |
500 |
26.49 |
30.75 |
-4.26 |
| Filtering |
Filter Reg Ex 1 |
10 |
22.81 |
25.65 |
-2.84 |
| Filtering |
Filter Reg Ex 1 |
100 |
41.95 |
49.37 |
-7.42 |
| Filtering |
Filter Reg Ex 1 |
200 |
42.05 |
36.45 |
5.60 |
| Filtering |
Filter Reg Ex 1 |
300 |
20.97 |
38.87 |
-17.90 |
| Filtering |
Filter Reg Ex 1 |
500 |
31.33 |
31.87 |
-0.53 |
| Filtering |
Filter Reg Ex 2 |
10 |
23.26 |
27.20 |
-3.94 |
| Filtering |
Filter Reg Ex 2 |
100 |
41.97 |
37.79 |
4.18 |
| Filtering |
Filter Reg Ex 2 |
200 |
43.21 |
48.02 |
-4.81 |
| Filtering |
Filter Reg Ex 2 |
300 |
22.49 |
39.11 |
-16.62 |
| Filtering |
Filter Reg Ex 2 |
500 |
31.84 |
38.54 |
-6.70 |
| Run/Shift |
Running Sum |
10 |
10.76 |
28.61 |
-17.86 |
| Run/Shift |
Running Sum |
100 |
13.60 |
59.99 |
-46.39 |
| Run/Shift |
Running Sum |
200 |
10.06 |
75.79 |
-65.73 |
| Run/Shift |
Running Sum |
300 |
6.75 |
72.37 |
-65.61 |
| Run/Shift |
Running Sum |
500 |
5.75 |
110.91 |
-105.16 |
| Run/Shift |
Shift (Lag) |
10 |
10.64 |
26.48 |
-15.83 |
| Run/Shift |
Shift (Lag) |
100 |
13.43 |
75.41 |
-61.98 |
| Run/Shift |
Shift (Lag) |
200 |
9.78 |
100.40 |
-90.62 |
| Run/Shift |
Shift (Lag) |
300 |
7.52 |
87.25 |
-79.74 |
| Run/Shift |
Shift (Lag) |
500 |
5.69 |
102.97 |
-97.28 |
| Writing |
Writing 100 new rows |
10 |
12.40 |
19.56 |
-7.16 |
| Writing |
Writing 100 new rows |
100 |
16.86 |
24.83 |
-7.97 |
| Writing |
Writing 100 new rows |
200 |
13.37 |
30.54 |
-17.17 |
| Writing |
Writing 100 new rows |
300 |
10.65 |
21.70 |
-11.05 |
| Writing |
Writing 100 new rows |
500 |
11.25 |
16.19 |
-4.94 |
| Writing |
Writing 1000 new rows |
10 |
13.96 |
18.04 |
-4.08 |
| Writing |
Writing 1000 new rows |
100 |
17.62 |
25.95 |
-8.33 |
| Writing |
Writing 1000 new rows |
200 |
15.71 |
28.17 |
-12.47 |
| Writing |
Writing 1000 new rows |
300 |
9.01 |
19.73 |
-10.73 |
| Writing |
Writing 1000 new rows |
500 |
10.05 |
18.24 |
-8.18 |
| Writing |
Writing 10000 new rows |
10 |
8.35 |
12.35 |
-4.00 |
| Writing |
Writing 10000 new rows |
100 |
15.16 |
21.15 |
-5.99 |
| Writing |
Writing 10000 new rows |
200 |
12.81 |
27.22 |
-14.41 |
| Writing |
Writing 10000 new rows |
300 |
9.21 |
28.88 |
-19.66 |
| Writing |
Writing 10000 new rows |
500 |
9.45 |
20.65 |
-11.20 |

## No Significant Elements



Rows Operations - Setup: Cluster
| Filtering |
Filter |
10 |
31.31 |
32.99 |
-1.67 |
| Filtering |
Filter |
100 |
84.01 |
91.11 |
-7.09 |
| Filtering |
Filter |
200 |
89.04 |
107.01 |
-17.97 |
| Filtering |
Filter |
300 |
92.50 |
115.49 |
-23.00 |
| Filtering |
Filter |
500 |
101.28 |
122.53 |
-21.25 |
| Filtering |
Filter Reg Ex 1 |
10 |
37.45 |
36.23 |
1.22 |
| Filtering |
Filter Reg Ex 1 |
100 |
121.98 |
117.41 |
4.57 |
| Filtering |
Filter Reg Ex 1 |
200 |
140.55 |
141.56 |
-1.00 |
| Filtering |
Filter Reg Ex 1 |
300 |
143.03 |
149.45 |
-6.42 |
| Filtering |
Filter Reg Ex 1 |
500 |
152.68 |
151.66 |
1.02 |
| Filtering |
Filter Reg Ex 2 |
10 |
36.36 |
37.53 |
-1.17 |
| Filtering |
Filter Reg Ex 2 |
100 |
127.24 |
126.33 |
0.91 |
| Filtering |
Filter Reg Ex 2 |
200 |
139.89 |
141.37 |
-1.48 |
| Filtering |
Filter Reg Ex 2 |
300 |
144.30 |
149.99 |
-5.69 |
| Filtering |
Filter Reg Ex 2 |
500 |
152.17 |
149.97 |
2.20 |
| Run/Shift |
Running Sum |
10 |
13.46 |
36.73 |
-23.27 |
| Run/Shift |
Running Sum |
100 |
22.65 |
150.75 |
-128.10 |
| Run/Shift |
Running Sum |
200 |
23.63 |
193.21 |
-169.58 |
| Run/Shift |
Running Sum |
300 |
23.58 |
207.74 |
-184.16 |
| Run/Shift |
Running Sum |
500 |
20.62 |
226.48 |
-205.87 |
| Run/Shift |
Shift (Lag) |
10 |
14.06 |
38.55 |
-24.49 |
| Run/Shift |
Shift (Lag) |
100 |
23.03 |
169.31 |
-146.29 |
| Run/Shift |
Shift (Lag) |
200 |
23.91 |
208.61 |
-184.70 |
| Run/Shift |
Shift (Lag) |
300 |
23.58 |
210.29 |
-186.72 |
| Run/Shift |
Shift (Lag) |
500 |
22.87 |
225.04 |
-202.17 |
| Writing |
Writing 100 new rows |
10 |
20.71 |
26.92 |
-6.21 |
| Writing |
Writing 100 new rows |
100 |
37.73 |
78.87 |
-41.14 |
| Writing |
Writing 100 new rows |
200 |
36.07 |
68.78 |
-32.71 |
| Writing |
Writing 100 new rows |
300 |
36.18 |
69.78 |
-33.60 |
| Writing |
Writing 100 new rows |
500 |
34.77 |
58.01 |
-23.24 |
| Writing |
Writing 1000 new rows |
10 |
20.60 |
27.09 |
-6.49 |
| Writing |
Writing 1000 new rows |
100 |
37.26 |
73.64 |
-36.38 |
| Writing |
Writing 1000 new rows |
200 |
33.73 |
71.30 |
-37.57 |
| Writing |
Writing 1000 new rows |
300 |
34.31 |
63.84 |
-29.53 |
| Writing |
Writing 1000 new rows |
500 |
33.33 |
60.10 |
-26.77 |
| Writing |
Writing 10000 new rows |
10 |
14.98 |
21.53 |
-6.56 |
| Writing |
Writing 10000 new rows |
100 |
32.02 |
70.39 |
-38.37 |
| Writing |
Writing 10000 new rows |
200 |
32.61 |
65.08 |
-32.47 |
| Writing |
Writing 10000 new rows |
300 |
33.06 |
65.40 |
-32.34 |
| Writing |
Writing 10000 new rows |
500 |
30.13 |
62.45 |
-32.32 |

## Warning in wilcox.test.default(x = c(37.3931337469713, 34.3061034620522, :
## cannot compute exact p-value with ties


## Warning in wilcox.test.default(x = c(14.4125942386937, 13.3451278278002, :
## cannot compute exact p-value with ties



Comparing Columns operations
Columns Operations - All Environments:
| Joining |
Full Outer Join 10 Columns |
10 |
2.06 |
2.67 |
-0.61 |
| Joining |
Full Outer Join 10 Columns |
100 |
8.26 |
11.53 |
-3.28 |
| Joining |
Full Outer Join 5 Columns |
10 |
2.33 |
3.13 |
-0.81 |
| Joining |
Full Outer Join 5 Columns |
100 |
4.20 |
6.31 |
-2.11 |
| Joining |
Inner Join 10 Columns |
10 |
6.00 |
7.18 |
-1.19 |
| Joining |
Inner Join 10 Columns |
100 |
8.81 |
11.95 |
-3.13 |
| Joining |
Inner Join 5 Columns |
10 |
5.57 |
6.25 |
-0.69 |
| Joining |
Inner Join 5 Columns |
100 |
7.14 |
9.45 |
-2.31 |
| Joining |
Left Outer Join 10 Columns |
10 |
5.64 |
6.59 |
-0.95 |
| Joining |
Left Outer Join 10 Columns |
100 |
8.17 |
11.73 |
-3.56 |
| Joining |
Left Outer Join 5 Columns |
10 |
5.33 |
6.69 |
-1.36 |
| Joining |
Left Outer Join 5 Columns |
100 |
4.25 |
5.65 |
-1.40 |
| Mathematics |
Mathematical Operation on Columns |
10 |
29.64 |
29.35 |
0.29 |
| Mathematics |
Mathematical Operation on Columns |
100 |
115.50 |
107.61 |
7.89 |
| Mathematics |
Mathematical Operation on Columns |
200 |
134.69 |
122.18 |
12.52 |
| Mathematics |
Mathematical Operation on Columns |
300 |
120.30 |
142.56 |
-22.26 |
| Mathematics |
Mathematical Operation on Columns |
500 |
121.49 |
126.37 |
-4.87 |
| Merging |
Merge 10 columns into 1 |
10 |
10.72 |
15.12 |
-4.40 |
| Merging |
Merge 10 columns into 1 |
100 |
20.63 |
33.84 |
-13.22 |
| Merging |
Merge 10 columns into 1 |
200 |
16.40 |
29.93 |
-13.52 |
| Merging |
Merge 10 columns into 1 |
300 |
14.21 |
27.80 |
-13.59 |
| Merging |
Merge 10 columns into 1 |
500 |
12.07 |
23.88 |
-11.81 |
| Merging |
Merge 2 columns into 1 |
10 |
11.34 |
15.93 |
-4.60 |
| Merging |
Merge 2 columns into 1 |
100 |
21.95 |
36.24 |
-14.28 |
| Merging |
Merge 2 columns into 1 |
200 |
19.30 |
31.15 |
-11.85 |
| Merging |
Merge 2 columns into 1 |
300 |
16.84 |
29.77 |
-12.93 |
| Merging |
Merge 2 columns into 1 |
500 |
15.18 |
26.39 |
-11.20 |
| Merging |
Merge 5 columns into 1 |
10 |
10.72 |
16.14 |
-5.42 |
| Merging |
Merge 5 columns into 1 |
100 |
20.75 |
34.68 |
-13.93 |
| Merging |
Merge 5 columns into 1 |
200 |
17.85 |
30.85 |
-13.00 |
| Merging |
Merge 5 columns into 1 |
300 |
15.48 |
28.68 |
-13.20 |
| Merging |
Merge 5 columns into 1 |
500 |
14.01 |
25.71 |
-11.70 |
| Sorting |
Sorting Asc 1 column |
10 |
6.30 |
8.16 |
-1.87 |
| Sorting |
Sorting Asc 1 column |
100 |
15.49 |
20.85 |
-5.36 |
| Sorting |
Sorting Asc 1 column |
200 |
8.15 |
10.40 |
-2.25 |
| Sorting |
Sorting Asc 1 column |
300 |
6.75 |
8.34 |
-1.59 |
| Sorting |
Sorting Asc 1 column |
500 |
5.96 |
8.14 |
-2.18 |
| Sorting |
Sorting Asc 10 column |
10 |
5.25 |
6.51 |
-1.26 |
| Sorting |
Sorting Asc 10 column |
100 |
14.67 |
19.13 |
-4.45 |
| Sorting |
Sorting Asc 10 column |
200 |
7.49 |
10.24 |
-2.75 |
| Sorting |
Sorting Asc 10 column |
300 |
6.43 |
8.27 |
-1.84 |
| Sorting |
Sorting Asc 10 column |
500 |
6.17 |
8.67 |
-2.51 |
| Sorting |
Sorting Asc 5 column |
10 |
6.20 |
7.68 |
-1.49 |
| Sorting |
Sorting Asc 5 column |
100 |
15.32 |
20.34 |
-5.02 |
| Sorting |
Sorting Asc 5 column |
200 |
8.72 |
10.54 |
-1.82 |
| Sorting |
Sorting Asc 5 column |
300 |
6.12 |
7.64 |
-1.52 |
| Sorting |
Sorting Asc 5 column |
500 |
6.39 |
7.98 |
-1.59 |
| Sorting |
Sorting Desc 1 column |
10 |
6.41 |
8.37 |
-1.96 |
| Sorting |
Sorting Desc 1 column |
100 |
15.47 |
20.09 |
-4.62 |
| Sorting |
Sorting Desc 1 column |
200 |
7.46 |
11.38 |
-3.92 |
| Sorting |
Sorting Desc 1 column |
300 |
6.64 |
9.04 |
-2.40 |
| Sorting |
Sorting Desc 1 column |
500 |
5.35 |
8.81 |
-3.46 |
| Sorting |
Sorting Desc 10 column |
10 |
5.61 |
7.31 |
-1.70 |
| Sorting |
Sorting Desc 10 column |
100 |
14.74 |
19.17 |
-4.43 |
| Sorting |
Sorting Desc 10 column |
200 |
7.56 |
9.35 |
-1.79 |
| Sorting |
Sorting Desc 10 column |
300 |
6.24 |
9.37 |
-3.13 |
| Sorting |
Sorting Desc 10 column |
500 |
6.28 |
8.16 |
-1.88 |
| Sorting |
Sorting Desc 5 column |
10 |
6.08 |
7.37 |
-1.29 |
| Sorting |
Sorting Desc 5 column |
100 |
15.06 |
19.34 |
-4.27 |
| Sorting |
Sorting Desc 5 column |
200 |
8.07 |
9.84 |
-1.76 |
| Sorting |
Sorting Desc 5 column |
300 |
6.38 |
9.36 |
-2.98 |
| Sorting |
Sorting Desc 5 column |
500 |
6.22 |
9.35 |
-3.13 |
| Splitting |
Split 1 Column into 10 |
10 |
6.52 |
9.39 |
-2.87 |
| Splitting |
Split 1 Column into 10 |
100 |
14.69 |
25.61 |
-10.93 |
| Splitting |
Split 1 Column into 10 |
200 |
13.67 |
23.19 |
-9.52 |
| Splitting |
Split 1 Column into 10 |
300 |
11.62 |
22.26 |
-10.64 |
| Splitting |
Split 1 Column into 10 |
500 |
11.29 |
19.74 |
-8.45 |
| Splitting |
Split 1 Column into 5 |
10 |
7.88 |
11.25 |
-3.36 |
| Splitting |
Split 1 Column into 5 |
100 |
16.73 |
27.86 |
-11.13 |
| Splitting |
Split 1 Column into 5 |
200 |
14.81 |
28.49 |
-13.68 |
| Splitting |
Split 1 Column into 5 |
300 |
13.86 |
26.04 |
-12.17 |
| Splitting |
Split 1 Column into 5 |
500 |
13.22 |
23.25 |
-10.03 |



## No Significant Elements





Columns Operations - Setup:Databricks
| Joining |
Full Outer Join 10 Columns |
10 |
1.60 |
2.12 |
-0.51 |
| Joining |
Full Outer Join 10 Columns |
100 |
7.32 |
8.10 |
-0.78 |
| Joining |
Full Outer Join 5 Columns |
10 |
1.64 |
2.63 |
-0.99 |
| Joining |
Full Outer Join 5 Columns |
100 |
2.17 |
3.95 |
-1.78 |
| Joining |
Inner Join 10 Columns |
10 |
3.42 |
5.61 |
-2.19 |
| Joining |
Inner Join 10 Columns |
100 |
7.93 |
9.15 |
-1.22 |
| Joining |
Inner Join 5 Columns |
10 |
3.55 |
5.13 |
-1.58 |
| Joining |
Inner Join 5 Columns |
100 |
6.36 |
7.32 |
-0.96 |
| Joining |
Left Outer Join 10 Columns |
10 |
3.55 |
5.24 |
-1.69 |
| Joining |
Left Outer Join 10 Columns |
100 |
7.24 |
8.34 |
-1.10 |
| Joining |
Left Outer Join 5 Columns |
10 |
3.08 |
5.38 |
-2.29 |
| Joining |
Left Outer Join 5 Columns |
100 |
2.08 |
2.96 |
-0.88 |
| Mathematics |
Mathematical Operation on Columns |
10 |
19.70 |
22.65 |
-2.95 |
| Mathematics |
Mathematical Operation on Columns |
100 |
114.39 |
103.84 |
10.55 |
| Mathematics |
Mathematical Operation on Columns |
200 |
148.64 |
114.14 |
34.50 |
| Mathematics |
Mathematical Operation on Columns |
300 |
131.89 |
139.29 |
-7.40 |
| Mathematics |
Mathematical Operation on Columns |
500 |
124.85 |
111.49 |
13.36 |
| Merging |
Merge 10 columns into 1 |
10 |
7.52 |
11.82 |
-4.30 |
| Merging |
Merge 10 columns into 1 |
100 |
22.09 |
34.39 |
-12.30 |
| Merging |
Merge 10 columns into 1 |
200 |
15.40 |
24.47 |
-9.07 |
| Merging |
Merge 10 columns into 1 |
300 |
13.05 |
21.64 |
-8.60 |
| Merging |
Merge 10 columns into 1 |
500 |
13.72 |
16.96 |
-3.24 |
| Merging |
Merge 2 columns into 1 |
10 |
7.18 |
13.35 |
-6.17 |
| Merging |
Merge 2 columns into 1 |
100 |
22.57 |
36.28 |
-13.71 |
| Merging |
Merge 2 columns into 1 |
200 |
18.55 |
23.31 |
-4.76 |
| Merging |
Merge 2 columns into 1 |
300 |
15.42 |
23.72 |
-8.30 |
| Merging |
Merge 2 columns into 1 |
500 |
14.97 |
21.23 |
-6.27 |
| Merging |
Merge 5 columns into 1 |
10 |
7.38 |
12.92 |
-5.54 |
| Merging |
Merge 5 columns into 1 |
100 |
21.46 |
35.06 |
-13.60 |
| Merging |
Merge 5 columns into 1 |
200 |
17.51 |
23.94 |
-6.43 |
| Merging |
Merge 5 columns into 1 |
300 |
14.34 |
22.68 |
-8.34 |
| Merging |
Merge 5 columns into 1 |
500 |
12.93 |
19.60 |
-6.67 |
| Sorting |
Sorting Asc 1 column |
10 |
4.03 |
6.27 |
-2.24 |
| Sorting |
Sorting Asc 1 column |
100 |
15.42 |
18.59 |
-3.17 |
| Sorting |
Sorting Asc 1 column |
200 |
5.99 |
4.71 |
1.27 |
| Sorting |
Sorting Asc 1 column |
300 |
4.31 |
2.83 |
1.48 |
| Sorting |
Sorting Asc 1 column |
500 |
3.86 |
3.62 |
0.24 |
| Sorting |
Sorting Asc 10 column |
10 |
3.61 |
5.32 |
-1.71 |
| Sorting |
Sorting Asc 10 column |
100 |
14.24 |
18.57 |
-4.33 |
| Sorting |
Sorting Asc 10 column |
200 |
4.47 |
5.49 |
-1.02 |
| Sorting |
Sorting Asc 10 column |
300 |
3.83 |
2.80 |
1.04 |
| Sorting |
Sorting Asc 10 column |
500 |
3.71 |
3.44 |
0.26 |
| Sorting |
Sorting Asc 5 column |
10 |
4.16 |
5.65 |
-1.48 |
| Sorting |
Sorting Asc 5 column |
100 |
15.49 |
19.07 |
-3.58 |
| Sorting |
Sorting Asc 5 column |
200 |
5.20 |
5.52 |
-0.32 |
| Sorting |
Sorting Asc 5 column |
300 |
3.77 |
3.12 |
0.65 |
| Sorting |
Sorting Asc 5 column |
500 |
3.76 |
3.87 |
-0.11 |
| Sorting |
Sorting Desc 1 column |
10 |
4.18 |
5.63 |
-1.45 |
| Sorting |
Sorting Desc 1 column |
100 |
16.39 |
18.67 |
-2.29 |
| Sorting |
Sorting Desc 1 column |
200 |
4.62 |
6.12 |
-1.51 |
| Sorting |
Sorting Desc 1 column |
300 |
3.30 |
3.22 |
0.08 |
| Sorting |
Sorting Desc 1 column |
500 |
3.59 |
4.06 |
-0.47 |
| Sorting |
Sorting Desc 10 column |
10 |
3.59 |
5.53 |
-1.94 |
| Sorting |
Sorting Desc 10 column |
100 |
14.31 |
17.78 |
-3.47 |
| Sorting |
Sorting Desc 10 column |
200 |
4.44 |
3.95 |
0.49 |
| Sorting |
Sorting Desc 10 column |
300 |
3.60 |
3.65 |
-0.05 |
| Sorting |
Sorting Desc 10 column |
500 |
3.78 |
3.85 |
-0.07 |
| Sorting |
Sorting Desc 5 column |
10 |
3.80 |
5.57 |
-1.77 |
| Sorting |
Sorting Desc 5 column |
100 |
15.06 |
18.25 |
-3.19 |
| Sorting |
Sorting Desc 5 column |
200 |
4.37 |
3.50 |
0.86 |
| Sorting |
Sorting Desc 5 column |
300 |
3.66 |
3.04 |
0.62 |
| Sorting |
Sorting Desc 5 column |
500 |
3.36 |
3.78 |
-0.41 |
| Splitting |
Split 1 Column into 10 |
10 |
5.61 |
9.22 |
-3.61 |
| Splitting |
Split 1 Column into 10 |
100 |
18.40 |
28.60 |
-10.20 |
| Splitting |
Split 1 Column into 10 |
200 |
16.25 |
21.32 |
-5.07 |
| Splitting |
Split 1 Column into 10 |
300 |
12.15 |
19.84 |
-7.69 |
| Splitting |
Split 1 Column into 10 |
500 |
11.74 |
16.25 |
-4.51 |
| Splitting |
Split 1 Column into 5 |
10 |
6.45 |
10.41 |
-3.96 |
| Splitting |
Split 1 Column into 5 |
100 |
20.23 |
30.66 |
-10.43 |
| Splitting |
Split 1 Column into 5 |
200 |
15.96 |
24.03 |
-8.07 |
| Splitting |
Split 1 Column into 5 |
300 |
13.94 |
21.04 |
-7.10 |
| Splitting |
Split 1 Column into 5 |
500 |
12.98 |
19.26 |
-6.28 |

## Warning in wilcox.test.default(x = c(2.58357031716984, 1.37888996360214, :
## cannot compute exact p-value with ties


## No Significant Elements



## No Significant Elements

Columns Operations - Setup:Local VM
| Joining |
Full Outer Join 10 Columns |
10 |
1.19 |
1.41 |
-0.22 |
| Joining |
Full Outer Join 10 Columns |
100 |
5.11 |
5.89 |
-0.77 |
| Joining |
Full Outer Join 5 Columns |
10 |
1.32 |
1.80 |
-0.49 |
| Joining |
Full Outer Join 5 Columns |
100 |
4.27 |
5.15 |
-0.88 |
| Joining |
Inner Join 10 Columns |
10 |
6.93 |
6.19 |
0.74 |
| Joining |
Inner Join 10 Columns |
100 |
5.37 |
7.27 |
-1.90 |
| Joining |
Inner Join 5 Columns |
10 |
5.73 |
5.55 |
0.18 |
| Joining |
Inner Join 5 Columns |
100 |
4.83 |
4.76 |
0.07 |
| Joining |
Left Outer Join 10 Columns |
10 |
5.96 |
5.79 |
0.17 |
| Joining |
Left Outer Join 10 Columns |
100 |
5.16 |
5.95 |
-0.79 |
| Joining |
Left Outer Join 5 Columns |
10 |
5.56 |
5.51 |
0.05 |
| Joining |
Left Outer Join 5 Columns |
100 |
4.41 |
5.54 |
-1.13 |
| Mathematics |
Mathematical Operation on Columns |
10 |
33.32 |
31.63 |
1.69 |
| Mathematics |
Mathematical Operation on Columns |
100 |
83.60 |
79.70 |
3.90 |
| Mathematics |
Mathematical Operation on Columns |
200 |
86.13 |
84.61 |
1.52 |
| Mathematics |
Mathematical Operation on Columns |
300 |
51.49 |
107.36 |
-55.87 |
| Mathematics |
Mathematical Operation on Columns |
500 |
56.93 |
87.55 |
-30.62 |
| Merging |
Merge 10 columns into 1 |
10 |
10.53 |
14.90 |
-4.36 |
| Merging |
Merge 10 columns into 1 |
100 |
13.99 |
21.19 |
-7.20 |
| Merging |
Merge 10 columns into 1 |
200 |
11.17 |
22.06 |
-10.89 |
| Merging |
Merge 10 columns into 1 |
300 |
7.75 |
22.86 |
-15.11 |
| Merging |
Merge 10 columns into 1 |
500 |
7.31 |
19.63 |
-12.32 |
| Merging |
Merge 2 columns into 1 |
10 |
11.63 |
14.55 |
-2.92 |
| Merging |
Merge 2 columns into 1 |
100 |
14.94 |
23.00 |
-8.06 |
| Merging |
Merge 2 columns into 1 |
200 |
13.07 |
23.60 |
-10.53 |
| Merging |
Merge 2 columns into 1 |
300 |
9.88 |
20.82 |
-10.94 |
| Merging |
Merge 2 columns into 1 |
500 |
8.22 |
17.24 |
-9.02 |
| Merging |
Merge 5 columns into 1 |
10 |
10.32 |
15.24 |
-4.91 |
| Merging |
Merge 5 columns into 1 |
100 |
14.25 |
22.33 |
-8.07 |
| Merging |
Merge 5 columns into 1 |
200 |
11.74 |
23.80 |
-12.05 |
| Merging |
Merge 5 columns into 1 |
300 |
9.70 |
20.63 |
-10.93 |
| Merging |
Merge 5 columns into 1 |
500 |
8.82 |
18.44 |
-9.61 |
| Sorting |
Sorting Asc 1 column |
10 |
5.72 |
6.99 |
-1.27 |
| Sorting |
Sorting Asc 1 column |
100 |
9.68 |
12.80 |
-3.13 |
| Sorting |
Sorting Asc 1 column |
200 |
7.94 |
12.76 |
-4.81 |
| Sorting |
Sorting Asc 1 column |
300 |
5.82 |
10.06 |
-4.24 |
| Sorting |
Sorting Asc 1 column |
500 |
6.22 |
8.70 |
-2.48 |
| Sorting |
Sorting Asc 10 column |
10 |
4.21 |
5.09 |
-0.88 |
| Sorting |
Sorting Asc 10 column |
100 |
9.03 |
10.58 |
-1.55 |
| Sorting |
Sorting Asc 10 column |
200 |
7.99 |
11.98 |
-3.99 |
| Sorting |
Sorting Asc 10 column |
300 |
5.72 |
9.60 |
-3.88 |
| Sorting |
Sorting Asc 10 column |
500 |
7.01 |
10.14 |
-3.13 |
| Sorting |
Sorting Asc 5 column |
10 |
5.07 |
6.62 |
-1.55 |
| Sorting |
Sorting Asc 5 column |
100 |
9.56 |
11.11 |
-1.55 |
| Sorting |
Sorting Asc 5 column |
200 |
10.13 |
13.13 |
-3.00 |
| Sorting |
Sorting Asc 5 column |
300 |
4.54 |
8.60 |
-4.05 |
| Sorting |
Sorting Asc 5 column |
500 |
6.94 |
8.08 |
-1.13 |
| Sorting |
Sorting Desc 1 column |
10 |
5.57 |
8.06 |
-2.49 |
| Sorting |
Sorting Desc 1 column |
100 |
8.82 |
9.92 |
-1.10 |
| Sorting |
Sorting Desc 1 column |
200 |
7.61 |
13.82 |
-6.20 |
| Sorting |
Sorting Desc 1 column |
300 |
6.31 |
11.88 |
-5.57 |
| Sorting |
Sorting Desc 1 column |
500 |
5.32 |
9.82 |
-4.50 |
| Sorting |
Sorting Desc 10 column |
10 |
4.45 |
6.25 |
-1.80 |
| Sorting |
Sorting Desc 10 column |
100 |
9.24 |
9.78 |
-0.54 |
| Sorting |
Sorting Desc 10 column |
200 |
8.21 |
11.56 |
-3.36 |
| Sorting |
Sorting Desc 10 column |
300 |
5.22 |
11.86 |
-6.65 |
| Sorting |
Sorting Desc 10 column |
500 |
6.79 |
8.61 |
-1.82 |
| Sorting |
Sorting Desc 5 column |
10 |
5.17 |
6.65 |
-1.48 |
| Sorting |
Sorting Desc 5 column |
100 |
9.85 |
9.87 |
-0.02 |
| Sorting |
Sorting Desc 5 column |
200 |
9.52 |
12.98 |
-3.46 |
| Sorting |
Sorting Desc 5 column |
300 |
5.46 |
12.27 |
-6.81 |
| Sorting |
Sorting Desc 5 column |
500 |
7.17 |
11.57 |
-4.40 |
| Splitting |
Split 1 Column into 10 |
10 |
4.07 |
5.97 |
-1.90 |
| Splitting |
Split 1 Column into 10 |
100 |
4.44 |
10.26 |
-5.82 |
| Splitting |
Split 1 Column into 10 |
200 |
4.32 |
12.36 |
-8.03 |
| Splitting |
Split 1 Column into 10 |
300 |
3.19 |
11.91 |
-8.72 |
| Splitting |
Split 1 Column into 10 |
500 |
3.13 |
9.48 |
-6.35 |
| Splitting |
Split 1 Column into 5 |
10 |
5.51 |
8.40 |
-2.89 |
| Splitting |
Split 1 Column into 5 |
100 |
6.48 |
12.71 |
-6.23 |
| Splitting |
Split 1 Column into 5 |
200 |
6.03 |
21.34 |
-15.31 |
| Splitting |
Split 1 Column into 5 |
300 |
4.94 |
17.74 |
-12.80 |
| Splitting |
Split 1 Column into 5 |
500 |
4.97 |
14.80 |
-9.82 |

## No Significant Elements

## No Significant Elements





Columns Operations - Setup:Cluster
| Joining |
Full Outer Join 10 Columns |
10 |
4.70 |
5.02 |
-0.32 |
| Joining |
Full Outer Join 10 Columns |
100 |
16.41 |
24.04 |
-7.63 |
| Joining |
Full Outer Join 5 Columns |
10 |
5.73 |
5.47 |
0.26 |
| Joining |
Full Outer Join 5 Columns |
100 |
8.12 |
12.20 |
-4.08 |
| Joining |
Inner Join 10 Columns |
10 |
9.28 |
11.31 |
-2.04 |
| Joining |
Inner Join 10 Columns |
100 |
17.47 |
22.21 |
-4.74 |
| Joining |
Inner Join 5 Columns |
10 |
9.27 |
9.20 |
0.07 |
| Joining |
Inner Join 5 Columns |
100 |
13.30 |
18.40 |
-5.10 |
| Joining |
Left Outer Join 10 Columns |
10 |
9.17 |
10.07 |
-0.90 |
| Joining |
Left Outer Join 10 Columns |
100 |
16.08 |
24.28 |
-8.20 |
| Joining |
Left Outer Join 5 Columns |
10 |
9.35 |
10.49 |
-1.14 |
| Joining |
Left Outer Join 5 Columns |
100 |
8.28 |
11.14 |
-2.87 |
| Mathematics |
Mathematical Operation on Columns |
10 |
42.17 |
38.19 |
3.98 |
| Mathematics |
Mathematical Operation on Columns |
100 |
181.53 |
170.97 |
10.56 |
| Mathematics |
Mathematical Operation on Columns |
200 |
203.93 |
213.38 |
-9.45 |
| Mathematics |
Mathematical Operation on Columns |
300 |
237.07 |
219.48 |
17.59 |
| Mathematics |
Mathematical Operation on Columns |
500 |
243.91 |
233.76 |
10.15 |
| Merging |
Merge 10 columns into 1 |
10 |
17.48 |
22.18 |
-4.70 |
| Merging |
Merge 10 columns into 1 |
100 |
30.96 |
58.05 |
-27.09 |
| Merging |
Merge 10 columns into 1 |
200 |
28.88 |
56.57 |
-27.69 |
| Merging |
Merge 10 columns into 1 |
300 |
29.21 |
50.00 |
-20.79 |
| Merging |
Merge 10 columns into 1 |
500 |
18.27 |
46.19 |
-27.92 |
| Merging |
Merge 2 columns into 1 |
10 |
19.07 |
23.86 |
-4.79 |
| Merging |
Merge 2 columns into 1 |
100 |
34.76 |
62.62 |
-27.86 |
| Merging |
Merge 2 columns into 1 |
200 |
33.25 |
61.95 |
-28.69 |
| Merging |
Merge 2 columns into 1 |
300 |
33.62 |
59.77 |
-26.15 |
| Merging |
Merge 2 columns into 1 |
500 |
29.55 |
55.00 |
-25.45 |
| Merging |
Merge 5 columns into 1 |
10 |
18.17 |
24.38 |
-6.21 |
| Merging |
Merge 5 columns into 1 |
100 |
32.32 |
58.62 |
-26.30 |
| Merging |
Merge 5 columns into 1 |
200 |
30.74 |
58.76 |
-28.02 |
| Merging |
Merge 5 columns into 1 |
300 |
29.32 |
56.77 |
-27.45 |
| Merging |
Merge 5 columns into 1 |
500 |
26.54 |
52.47 |
-25.93 |
| Sorting |
Sorting Asc 1 column |
10 |
11.99 |
14.31 |
-2.32 |
| Sorting |
Sorting Asc 1 column |
100 |
27.27 |
41.46 |
-14.19 |
| Sorting |
Sorting Asc 1 column |
200 |
12.90 |
17.06 |
-4.16 |
| Sorting |
Sorting Asc 1 column |
300 |
13.47 |
15.93 |
-2.46 |
| Sorting |
Sorting Asc 1 column |
500 |
9.66 |
16.06 |
-6.41 |
| Sorting |
Sorting Asc 10 column |
10 |
10.60 |
11.73 |
-1.14 |
| Sorting |
Sorting Asc 10 column |
100 |
26.82 |
37.33 |
-10.51 |
| Sorting |
Sorting Asc 10 column |
200 |
12.53 |
16.26 |
-3.73 |
| Sorting |
Sorting Asc 10 column |
300 |
13.04 |
16.53 |
-3.50 |
| Sorting |
Sorting Asc 10 column |
500 |
9.40 |
16.20 |
-6.80 |
| Sorting |
Sorting Asc 5 column |
10 |
12.52 |
13.89 |
-1.38 |
| Sorting |
Sorting Asc 5 column |
100 |
26.48 |
41.35 |
-14.87 |
| Sorting |
Sorting Asc 5 column |
200 |
12.93 |
15.40 |
-2.48 |
| Sorting |
Sorting Asc 5 column |
300 |
13.49 |
14.76 |
-1.27 |
| Sorting |
Sorting Asc 5 column |
500 |
10.53 |
16.00 |
-5.47 |
| Sorting |
Sorting Desc 1 column |
10 |
12.53 |
14.45 |
-1.92 |
| Sorting |
Sorting Desc 1 column |
100 |
26.93 |
43.23 |
-16.30 |
| Sorting |
Sorting Desc 1 column |
200 |
12.85 |
17.05 |
-4.20 |
| Sorting |
Sorting Desc 1 column |
300 |
13.29 |
15.00 |
-1.71 |
| Sorting |
Sorting Desc 1 column |
500 |
8.92 |
16.30 |
-7.38 |
| Sorting |
Sorting Desc 10 column |
10 |
11.98 |
13.01 |
-1.03 |
| Sorting |
Sorting Desc 10 column |
100 |
26.63 |
40.75 |
-14.12 |
| Sorting |
Sorting Desc 10 column |
200 |
12.52 |
15.72 |
-3.20 |
| Sorting |
Sorting Desc 10 column |
300 |
13.05 |
15.84 |
-2.79 |
| Sorting |
Sorting Desc 10 column |
500 |
10.25 |
15.88 |
-5.63 |
| Sorting |
Sorting Desc 5 column |
10 |
12.46 |
12.42 |
0.05 |
| Sorting |
Sorting Desc 5 column |
100 |
25.50 |
40.45 |
-14.96 |
| Sorting |
Sorting Desc 5 column |
200 |
12.59 |
16.21 |
-3.62 |
| Sorting |
Sorting Desc 5 column |
300 |
13.13 |
16.18 |
-3.05 |
| Sorting |
Sorting Desc 5 column |
500 |
10.02 |
16.04 |
-6.02 |
| Splitting |
Split 1 Column into 10 |
10 |
13.23 |
16.59 |
-3.36 |
| Splitting |
Split 1 Column into 10 |
100 |
27.76 |
50.35 |
-22.58 |
| Splitting |
Split 1 Column into 10 |
200 |
27.19 |
48.57 |
-21.38 |
| Splitting |
Split 1 Column into 10 |
300 |
27.43 |
47.79 |
-20.36 |
| Splitting |
Split 1 Column into 10 |
500 |
26.69 |
47.23 |
-20.54 |
| Splitting |
Split 1 Column into 5 |
10 |
15.49 |
18.61 |
-3.12 |
| Splitting |
Split 1 Column into 5 |
100 |
30.23 |
52.56 |
-22.34 |
| Splitting |
Split 1 Column into 5 |
200 |
30.10 |
51.75 |
-21.65 |
| Splitting |
Split 1 Column into 5 |
300 |
31.56 |
52.63 |
-21.07 |
| Splitting |
Split 1 Column into 5 |
500 |
30.19 |
48.15 |
-17.96 |



## Warning in wilcox.test.default(x = c(43.9746927716948, 43.3457651552285, :
## cannot compute exact p-value with ties
## No Significant Elements






Comparing Aggregate operations
Scala is faster than PySpark
Aggregate Operations - All Environments:
| Grouping |
GroupBy 1 column |
10 |
1.30 |
1.56 |
-0.27 |
| Grouping |
GroupBy 1 column |
100 |
7.97 |
8.65 |
-0.68 |
| Grouping |
GroupBy 1 column |
200 |
11.04 |
11.45 |
-0.40 |
| Grouping |
GroupBy 1 column |
300 |
11.30 |
13.10 |
-1.80 |
| Grouping |
GroupBy 1 column |
500 |
14.32 |
13.41 |
0.91 |
| Grouping |
GroupBy 10 columns |
10 |
0.87 |
1.08 |
-0.21 |
| Grouping |
GroupBy 10 columns |
100 |
3.40 |
4.63 |
-1.23 |
| Grouping |
GroupBy 10 columns |
200 |
4.17 |
6.40 |
-2.23 |
| Grouping |
GroupBy 10 columns |
300 |
4.15 |
6.85 |
-2.70 |
| Grouping |
GroupBy 10 columns |
500 |
3.06 |
3.84 |
-0.78 |
| Grouping |
GroupBy 5 columns |
10 |
0.92 |
1.10 |
-0.18 |
| Grouping |
GroupBy 5 columns |
100 |
4.51 |
5.43 |
-0.93 |
| Grouping |
GroupBy 5 columns |
200 |
6.14 |
7.72 |
-1.59 |
| Grouping |
GroupBy 5 columns |
300 |
6.67 |
8.83 |
-2.15 |
| Grouping |
GroupBy 5 columns |
500 |
8.81 |
9.42 |
-0.61 |
| Ranking |
Ranking by Group |
10 |
5.79 |
7.01 |
-1.22 |
| Ranking |
Ranking by Group |
100 |
16.66 |
24.20 |
-7.54 |
| Ranking |
Ranking by Group |
200 |
15.42 |
22.58 |
-7.16 |
| Ranking |
Ranking by Group |
300 |
13.88 |
21.77 |
-7.89 |
| Ranking |
Ranking by Group |
500 |
13.40 |
20.19 |
-6.79 |



Aggregate Operations - Setup: Databricks
| Grouping |
GroupBy 1 column |
10 |
0.73 |
1.08 |
-0.35 |
| Grouping |
GroupBy 1 column |
100 |
8.12 |
7.39 |
0.73 |
| Grouping |
GroupBy 1 column |
200 |
12.03 |
11.49 |
0.54 |
| Grouping |
GroupBy 1 column |
300 |
12.30 |
13.54 |
-1.24 |
| Grouping |
GroupBy 1 column |
500 |
17.27 |
13.56 |
3.70 |
| Grouping |
GroupBy 10 columns |
10 |
0.58 |
0.85 |
-0.27 |
| Grouping |
GroupBy 10 columns |
100 |
3.55 |
4.07 |
-0.52 |
| Grouping |
GroupBy 10 columns |
200 |
4.52 |
5.95 |
-1.43 |
| Grouping |
GroupBy 10 columns |
300 |
4.58 |
6.66 |
-2.08 |
| Grouping |
GroupBy 10 columns |
500 |
3.16 |
2.83 |
0.32 |
| Grouping |
GroupBy 5 columns |
10 |
0.62 |
0.83 |
-0.21 |
| Grouping |
GroupBy 5 columns |
100 |
4.49 |
4.87 |
-0.38 |
| Grouping |
GroupBy 5 columns |
200 |
6.21 |
7.43 |
-1.22 |
| Grouping |
GroupBy 5 columns |
300 |
6.94 |
8.55 |
-1.62 |
| Grouping |
GroupBy 5 columns |
500 |
10.03 |
8.79 |
1.24 |
| Ranking |
Ranking by Group |
10 |
4.14 |
6.20 |
-2.06 |
| Ranking |
Ranking by Group |
100 |
17.25 |
23.41 |
-6.15 |
| Ranking |
Ranking by Group |
200 |
15.94 |
19.02 |
-3.08 |
| Ranking |
Ranking by Group |
300 |
12.91 |
18.37 |
-5.46 |
| Ranking |
Ranking by Group |
500 |
13.00 |
16.05 |
-3.05 |

## No Significant Elements

Aggregate Operations - Setup: Local VM
| Grouping |
GroupBy 1 column |
10 |
0.79 |
1.34 |
-0.55 |
| Grouping |
GroupBy 1 column |
100 |
2.87 |
5.44 |
-2.57 |
| Grouping |
GroupBy 1 column |
200 |
3.45 |
5.89 |
-2.44 |
| Grouping |
GroupBy 1 column |
300 |
2.37 |
5.85 |
-3.48 |
| Grouping |
GroupBy 1 column |
500 |
3.48 |
5.39 |
-1.90 |
| Grouping |
GroupBy 10 columns |
10 |
0.52 |
0.77 |
-0.26 |
| Grouping |
GroupBy 10 columns |
100 |
1.46 |
2.65 |
-1.19 |
| Grouping |
GroupBy 10 columns |
200 |
1.66 |
3.43 |
-1.77 |
| Grouping |
GroupBy 10 columns |
300 |
1.24 |
2.94 |
-1.70 |
| Grouping |
GroupBy 10 columns |
500 |
1.40 |
2.73 |
-1.33 |
| Grouping |
GroupBy 5 columns |
10 |
0.57 |
0.76 |
-0.19 |
| Grouping |
GroupBy 5 columns |
100 |
1.93 |
3.06 |
-1.13 |
| Grouping |
GroupBy 5 columns |
200 |
2.35 |
3.97 |
-1.61 |
| Grouping |
GroupBy 5 columns |
300 |
1.83 |
4.06 |
-2.23 |
| Grouping |
GroupBy 5 columns |
500 |
2.38 |
3.81 |
-1.43 |
| Ranking |
Ranking by Group |
10 |
3.55 |
4.54 |
-0.98 |
| Ranking |
Ranking by Group |
100 |
10.16 |
14.68 |
-4.52 |
| Ranking |
Ranking by Group |
200 |
8.14 |
14.22 |
-6.07 |
| Ranking |
Ranking by Group |
300 |
7.21 |
12.12 |
-4.91 |
| Ranking |
Ranking by Group |
500 |
5.86 |
13.34 |
-7.48 |



Aggregate Operations - Setup: Cluster
| Grouping |
GroupBy 1 column |
10 |
3.44 |
2.99 |
0.44 |
| Grouping |
GroupBy 1 column |
100 |
17.88 |
17.61 |
0.27 |
| Grouping |
GroupBy 1 column |
200 |
24.25 |
22.46 |
1.79 |
| Grouping |
GroupBy 1 column |
300 |
27.39 |
26.73 |
0.66 |
| Grouping |
GroupBy 1 column |
500 |
30.10 |
29.13 |
0.97 |
| Grouping |
GroupBy 10 columns |
10 |
2.15 |
2.14 |
0.01 |
| Grouping |
GroupBy 10 columns |
100 |
6.98 |
9.69 |
-2.70 |
| Grouping |
GroupBy 10 columns |
200 |
8.50 |
13.25 |
-4.75 |
| Grouping |
GroupBy 10 columns |
300 |
9.10 |
15.03 |
-5.93 |
| Grouping |
GroupBy 10 columns |
500 |
6.19 |
8.08 |
-1.88 |
| Grouping |
GroupBy 5 columns |
10 |
2.23 |
2.29 |
-0.07 |
| Grouping |
GroupBy 5 columns |
100 |
9.68 |
11.32 |
-1.64 |
| Grouping |
GroupBy 5 columns |
200 |
13.55 |
15.82 |
-2.27 |
| Grouping |
GroupBy 5 columns |
300 |
15.89 |
18.90 |
-3.01 |
| Grouping |
GroupBy 5 columns |
500 |
19.24 |
21.91 |
-2.67 |
| Ranking |
Ranking by Group |
10 |
13.56 |
13.58 |
-0.02 |
| Ranking |
Ranking by Group |
100 |
28.46 |
44.82 |
-16.35 |
| Ranking |
Ranking by Group |
200 |
28.93 |
46.44 |
-17.51 |
| Ranking |
Ranking by Group |
300 |
29.15 |
47.86 |
-18.71 |
| Ranking |
Ranking by Group |
500 |
29.29 |
42.18 |
-12.88 |




Comparing Mixed operations
Scala is faster than PySpark
Mixed Operations - All Environments:
| Pivots |
Pivot 1 Rows and 1 Column |
10 |
4.42 |
4.87 |
-0.44 |
| Pivots |
Pivot 1 Rows and 1 Column |
100 |
33.09 |
31.88 |
1.21 |
| Pivots |
Pivot 1 Rows and 1 Column |
200 |
47.27 |
46.44 |
0.83 |
| Pivots |
Pivot 1 Rows and 1 Column |
300 |
47.36 |
49.58 |
-2.22 |
| Pivots |
Pivot 1 Rows and 1 Column |
500 |
59.54 |
55.97 |
3.57 |
| Pivots |
Pivot 10 Rows and 1 Column |
10 |
2.15 |
2.57 |
-0.42 |
| Pivots |
Pivot 10 Rows and 1 Column |
100 |
11.83 |
12.59 |
-0.76 |
| Pivots |
Pivot 10 Rows and 1 Column |
200 |
17.40 |
19.51 |
-2.10 |
| Pivots |
Pivot 10 Rows and 1 Column |
300 |
18.29 |
23.15 |
-4.86 |
| Pivots |
Pivot 10 Rows and 1 Column |
500 |
23.10 |
29.14 |
-6.04 |
| Pivots |
Pivot 5 Rows and 1 Column |
10 |
2.37 |
2.84 |
-0.47 |
| Pivots |
Pivot 5 Rows and 1 Column |
100 |
13.30 |
13.71 |
-0.40 |
| Pivots |
Pivot 5 Rows and 1 Column |
200 |
21.63 |
22.50 |
-0.87 |
| Pivots |
Pivot 5 Rows and 1 Column |
300 |
24.56 |
29.58 |
-5.02 |
| Pivots |
Pivot 5 Rows and 1 Column |
500 |
32.85 |
33.67 |
-0.82 |

## No Significant Elements
Mixed Operations - Setup: Databricks
| Pivots |
Pivot 1 Rows and 1 Column |
10 |
2.57 |
3.06 |
-0.49 |
| Pivots |
Pivot 1 Rows and 1 Column |
100 |
29.73 |
24.38 |
5.36 |
| Pivots |
Pivot 1 Rows and 1 Column |
200 |
45.50 |
41.06 |
4.44 |
| Pivots |
Pivot 1 Rows and 1 Column |
300 |
42.56 |
46.00 |
-3.43 |
| Pivots |
Pivot 1 Rows and 1 Column |
500 |
52.25 |
48.85 |
3.40 |
| Pivots |
Pivot 10 Rows and 1 Column |
10 |
1.41 |
1.95 |
-0.54 |
| Pivots |
Pivot 10 Rows and 1 Column |
100 |
10.25 |
9.22 |
1.02 |
| Pivots |
Pivot 10 Rows and 1 Column |
200 |
15.68 |
13.90 |
1.78 |
| Pivots |
Pivot 10 Rows and 1 Column |
300 |
16.40 |
18.99 |
-2.60 |
| Pivots |
Pivot 10 Rows and 1 Column |
500 |
21.18 |
22.55 |
-1.38 |
| Pivots |
Pivot 5 Rows and 1 Column |
10 |
1.55 |
2.16 |
-0.60 |
| Pivots |
Pivot 5 Rows and 1 Column |
100 |
11.28 |
10.00 |
1.28 |
| Pivots |
Pivot 5 Rows and 1 Column |
200 |
17.79 |
16.44 |
1.35 |
| Pivots |
Pivot 5 Rows and 1 Column |
300 |
20.73 |
22.13 |
-1.41 |
| Pivots |
Pivot 5 Rows and 1 Column |
500 |
29.20 |
25.17 |
4.03 |

## No Significant Elements
Mixed Operations - Setup: Local VM
| Pivots |
Pivot 1 Rows and 1 Column |
10 |
3.64 |
4.81 |
-1.17 |
| Pivots |
Pivot 1 Rows and 1 Column |
100 |
23.76 |
27.31 |
-3.55 |
| Pivots |
Pivot 1 Rows and 1 Column |
200 |
32.26 |
33.57 |
-1.31 |
| Pivots |
Pivot 1 Rows and 1 Column |
300 |
24.67 |
34.97 |
-10.31 |
| Pivots |
Pivot 1 Rows and 1 Column |
500 |
39.40 |
41.93 |
-2.53 |
| Pivots |
Pivot 10 Rows and 1 Column |
10 |
1.33 |
1.93 |
-0.60 |
| Pivots |
Pivot 10 Rows and 1 Column |
100 |
6.97 |
9.01 |
-2.04 |
| Pivots |
Pivot 10 Rows and 1 Column |
200 |
10.20 |
14.85 |
-4.64 |
| Pivots |
Pivot 10 Rows and 1 Column |
300 |
8.60 |
12.78 |
-4.18 |
| Pivots |
Pivot 10 Rows and 1 Column |
500 |
12.36 |
19.47 |
-7.11 |
| Pivots |
Pivot 5 Rows and 1 Column |
10 |
1.53 |
2.14 |
-0.61 |
| Pivots |
Pivot 5 Rows and 1 Column |
100 |
7.73 |
10.19 |
-2.47 |
| Pivots |
Pivot 5 Rows and 1 Column |
200 |
13.40 |
15.88 |
-2.48 |
| Pivots |
Pivot 5 Rows and 1 Column |
300 |
11.82 |
21.80 |
-9.98 |
| Pivots |
Pivot 5 Rows and 1 Column |
500 |
16.65 |
23.62 |
-6.97 |

Mixed Operations - Setup: Cluster
| Pivots |
Pivot 1 Rows and 1 Column |
10 |
9.69 |
8.59 |
1.10 |
| Pivots |
Pivot 1 Rows and 1 Column |
100 |
58.46 |
56.04 |
2.42 |
| Pivots |
Pivot 1 Rows and 1 Column |
200 |
80.83 |
82.92 |
-2.10 |
| Pivots |
Pivot 1 Rows and 1 Column |
300 |
102.34 |
85.94 |
16.39 |
| Pivots |
Pivot 1 Rows and 1 Column |
500 |
114.39 |
98.28 |
16.11 |
| Pivots |
Pivot 10 Rows and 1 Column |
10 |
5.28 |
5.09 |
0.19 |
| Pivots |
Pivot 10 Rows and 1 Column |
100 |
24.73 |
26.48 |
-1.75 |
| Pivots |
Pivot 10 Rows and 1 Column |
200 |
35.26 |
40.03 |
-4.77 |
| Pivots |
Pivot 10 Rows and 1 Column |
300 |
41.43 |
52.20 |
-10.77 |
| Pivots |
Pivot 10 Rows and 1 Column |
500 |
48.41 |
61.63 |
-13.22 |
| Pivots |
Pivot 5 Rows and 1 Column |
10 |
5.68 |
5.61 |
0.06 |
| Pivots |
Pivot 5 Rows and 1 Column |
100 |
28.50 |
28.15 |
0.35 |
| Pivots |
Pivot 5 Rows and 1 Column |
200 |
45.74 |
47.83 |
-2.09 |
| Pivots |
Pivot 5 Rows and 1 Column |
300 |
56.95 |
60.01 |
-3.06 |
| Pivots |
Pivot 5 Rows and 1 Column |
500 |
72.58 |
70.78 |
1.79 |

## No Significant Elements
Comparing Runs operations
slight improvement with additonal runs (not statistically significant)
filtered=data_raw
filtered$Operation = forcats::fct_reorder(filtered$RunID,filtered[[targetVar]],.fun=median,desc=F)
## Warning: Some components of ... were not used: desc
p=ggplot(data=filtered, aes_string(x='RunID',y=targetVar,fill='Language',color='Language')) +
geom_boxplot(outlier.size = 0.5,size=0.5,alpha=0.6,outlier.shape = outliersShape) +
scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]],probs=outliersQuantile))
,labels=scales::comma_format(accuracy=1,suffix=targetUnits))
ggplot_colors(p)
## Warning: Removed 910 rows containing non-finite values (stat_boxplot).

By Setup
p=ggplot(data=filtered, aes_string(x='RunID',y=targetVar,fill='Language',color='Language')) +
geom_boxplot(outlier.size = 0.5,size=0.5,alpha=0.6,outlier.shape = outliersShape) +
scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]],probs=outliersQuantile))
,labels=scales::comma_format(accuracy=1,suffix=targetUnits)) +
facet_wrap(~Setup,ncol=1)
ggplot_colors(p)
## Warning: Removed 910 rows containing non-finite values (stat_boxplot).
